library(tidyverse)
── Attaching core tidyverse packages ──────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4 ── Conflicts ────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(WGCNA)
Loading required package: dynamicTreeCut
Loading required package: fastcluster
Attaching package: ‘fastcluster’
The following object is masked from ‘package:stats’:
hclust
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Registered S3 method overwritten by 'rmarkdown':
method from
print.paged_df
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Attaching package: ‘WGCNA’
The following object is masked from ‘package:stats’:
cor
library(gplots)
Attaching package: ‘gplots’
The following object is masked from ‘package:stats’:
lowess
options(stringsAsFactors = FALSE)
load sample info
sample.description <- read.csv("../output/sample.description.sporophyte.all.csv")
load reads
lcpm <- read.csv("../output/sporophyte_combined_log2cpm.csv.gz", row.names = 1, check.names = FALSE)
head(lcpm)
dim(lcpm)
[1] 29409 109
Filter for genes with the highest coefficient of variation
CV <- apply(lcpm, 1, \(x) abs(sd(x)/mean(x)))
hist(log10(CV))
names(CV) <- rownames(lcpm)
CV[str_detect(names(CV), "18G076300|33G031700")]
Ceric.18G076300.v2.1 Ceric.33G031700.v2.1
0.1437800 0.5919531
quantile(CV, 0.25)
25%
0.1323597
LFY2 has a pretty low CV; have to include 25% of the genes by CV to include it.
lcpm.filter <- lcpm[CV > quantile(CV, 0.25),]
dim(lcpm.filter)
[1] 22056 109
WGCNA wants genes in columns
lcpm.filter.t <- t(lcpm.filter)
Soft thresholding
powers <- c(c(1:10), seq(from = 12, to=20, by=2))
sft <- pickSoftThreshold(lcpm.filter.t, powerVector = powers, verbose = 5,networkType = "signed hybrid", blockSize = 20000)
pickSoftThreshold: calculating connectivity for given powers...
..working on genes 1 through 20000 of 22056
Warning: executing %dopar% sequentially: no parallel backend registered
..working on genes 20001 through 22056 of 22056
sizeGrWindow(9, 5)
par(mfrow = c(1,2))
cex1 <- 0.9
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red")
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
choose 5
softPower <- 5
adjacency <- adjacency(lcpm.filter.t, power = softPower, type = "signed hybrid")
# Turn adjacency into topological overlap
TOM <- TOMsimilarity(adjacency, TOMType = "signed");
..connectivity..
..matrix multiplication (system BLAS)..
..normalization..
..done.
dissTOM <- 1-TOM
# Call the hierarchical clustering function
geneTree <- hclust(as.dist(dissTOM), method = "average")
# Plot the resulting clustering tree (dendrogram)
sizeGrWindow(12,9)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE, hang = 0.04)
define modules
# We like large modules, so we set the minimum module size relatively high:
minModuleSize <- 30;
# Module identification using dynamic tree cut:
dynamicMods <- cutreeDynamic(dendro = geneTree, distM = dissTOM,
deepSplit <- 2, pamRespectsDendro = FALSE,
minClusterSize = minModuleSize);
..done.
table(dynamicMods)
dynamicMods
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
4035 2424 1505 1341 1325 1227 991 810 663 648 595 539 535 496 485 435 435
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
416 332 325 306 275 249 248 213 204 196 184 172 165 91 75 70 46
# Convert numeric labels into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
dynamicColors
black blue brown cyan darkgreen
991 2424 1505 496 275
darkgrey darkmagenta darkolivegreen darkorange darkred
248 46 70 204 306
darkturquoise green greenyellow grey60 lightcyan
249 1325 595 435 435
lightgreen lightyellow magenta midnightblue orange
416 332 663 485 213
paleturquoise pink purple red royalblue
91 810 648 1227 325
saddlebrown salmon skyblue steelblue tan
172 535 184 165 539
turquoise violet white yellow
4035 75 196 1341
# Plot the dendrogram and colors underneath
sizeGrWindow(8,6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")
merge similar modules
# Calculate eigengenes
MEList <- moduleEigengenes(lcpm.filter.t, colors = dynamicColors)
MEs <- MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss <- 1-cor(MEs);
# Cluster module eigengenes
METree <- hclust(as.dist(MEDiss), method = "average");
# Plot the result
sizeGrWindow(7, 6)
plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
merge with correlation > 0.8
MEDissThres = 0.2
# Plot the cut line into the dendrogram
plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
abline(h=MEDissThres, col = "red")
# Call an automatic merging function
merge = mergeCloseModules(lcpm.filter.t, dynamicColors, cutHeight = MEDissThres, verbose = 3)
mergeCloseModules: Merging modules whose distance is less than 0.2
multiSetMEs: Calculating module MEs.
Working on set 1 ...
moduleEigengenes: Calculating 34 module eigengenes in given set.
multiSetMEs: Calculating module MEs.
Working on set 1 ...
moduleEigengenes: Calculating 23 module eigengenes in given set.
Calculating new MEs...
multiSetMEs: Calculating module MEs.
Working on set 1 ...
moduleEigengenes: Calculating 23 module eigengenes in given set.
# The merged module colors
mergedColors = merge$colors
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs
compare pre and post merge
sizeGrWindow(12, 9)
#pdf(file = "Plots/geneDendro-3.pdf", wi = 9, he = 6)
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
#dev.off()
# Rename to moduleColors
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1;
MEs = mergedMEs
table(merge$colors)
black blue brown cyan darkgreen
1195 6459 1505 496 710
darkgrey darkmagenta darkolivegreen darkred darkturquoise
248 46 70 306 249
green lightcyan lightgreen lightyellow midnightblue
2135 970 1643 516 485
orange paleturquoise royalblue saddlebrown tan
213 739 490 172 1797
violet white yellow
75 196 1341
length(table(merge$colors))
[1] 23
median(table(merge$colors))
[1] 496
Which module is LFY in?
CrLFY1 <- "Ceric.33G031700"
CrLFY2 <- "Ceric.18G076300"
module.assignment <- tibble(geneID=colnames(lcpm.filter.t), module = mergedColors)
module.assignment %>%
filter(str_detect(geneID, "18G076300|33G031700"))
module.assignment %>% group_by(module) %>% summarize(n_genes = n()) %>% arrange(n_genes)
Plot eigengenes
Make sure sample info sheet is in the correct order.
rownames(lcpm.filter.t) %>% str_replace_all("\\.", "-") == sample.description$sample
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[17] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[33] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[49] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[65] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[81] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[97] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
sample.eigen <- cbind(sample.description, MEs)
sample.eigen
sample.eigen.mean <- sample.eigen %>%
group_by(group) %>%
summarize(across(starts_with("ME"), mean),
label=unique(label))
sample.eigen %>%
pivot_longer(starts_with("ME"), names_to = "ME") %>%
ggplot(aes(y=label, x = value)) +
geom_point() +
facet_wrap(~ME, ncol=5) +
theme(axis.text.y = element_text(size = 6))
A heat map:
sample.eigen.mean$group %>% sort
[1] "CrLFY1-OX-S5"
[2] "CrLFY1-OX-WYS"
[3] "CrLFY2-OX-S5"
[4] "CrLFY2-OX-WYS"
[5] "Cross-S5"
[6] "Cross-WYS"
[7] "PRJEB33372-frond"
[8] "PRJNA1149654-whole sporophyte DMSO"
[9] "PRJNA1149654-whole sporophyte IAA"
[10] "PRJNA555841-30 day whole sporophyte 24h 2-4D"
[11] "PRJNA555841-30 day whole sporophyte 38h 2-4D"
[12] "PRJNA555841-30 Day whole sporophyte mock"
[13] "PRJNA578676-leaf tip"
[14] "PRJNA578676-leaf-no tip"
[15] "PRJNA578676-root tip"
[16] "PRJNA578676-root-no tip"
[17] "PRJNA578676-shoot tip"
[18] "PRJNA651764-developing leaf"
[19] "PRJNA651765-expanding leaf"
[20] "PRJNA651766-sterile leaf"
[21] "PRJNA651767-fertile leaf"
[22] "PRJNA651768-sori"
[23] "PRJNA651771-stem"
[24] "PRJNA651772-root"
[25] "PRJNA651773-young sporophyte"
[26] "PRJNA666635-0mins-aba-high-br"
[27] "PRJNA666635-0mins-dry-high-br"
[28] "PRJNA666635-0mins-wet-high-br"
[29] "PRJNA666635-60mins-abamock-low-br"
[30] "PRJNA666635-60mins-dryaba-high-br"
[31] "PRJNA666635-60mins-dryaba-low-br"
[32] "PRJNA666635-60mins-drymock-high-br"
[33] "PRJNA666635-60mins-drymock-low-br"
[34] "PRJNA666635-60mins-wetmock-low-br"
[35] "PRJNA681601-Cr_BAM"
[36] "PRJNA681601-Cr_Callus"
[37] "PRJNA681601-Cr_Leaf"
[38] "PRJNA681601-Cr_Root"
[39] "PRJNA681601-Cr_SAM"
[40] "PRJNA857489-differentiated root"
[41] "PRJNA857489-root tip"
[42] "PRJNA857489-shoot"
[43] "RNAi-S5"
[44] "RNAi-WYS"
[45] "WT-S5"
[46] "WT-WYS"
sample.eigen.mean$label %>% sort
[1] "0mins-aba-high-br" "0mins-dry-high-br"
[3] "0mins-wet-high-br" "30 day whole sporophyte 24h 2-4D"
[5] "30 day whole sporophyte 38h 2-4D" "30 Day whole sporophyte mock"
[7] "60mins-abamock-low-br" "60mins-dryaba-high-br"
[9] "60mins-dryaba-low-br" "60mins-drymock-high-br"
[11] "60mins-drymock-low-br" "60mins-wetmock-low-br"
[13] "Cr_BAM" "Cr_Callus"
[15] "Cr_Leaf" "Cr_Root"
[17] "Cr_SAM" "CrLFY1-OX-S5"
[19] "CrLFY1-OX-WYS" "CrLFY2-OX-S5"
[21] "CrLFY2-OX-WYS" "Cross-S5"
[23] "Cross-WYS" "developing leaf"
[25] "differentiated root" "expanding leaf"
[27] "fertile leaf" "frond"
[29] "leaf tip" "leaf-no tip"
[31] "RNAi-S5" "RNAi-WYS"
[33] "root" "root tip"
[35] "root tip" "root-no tip"
[37] "shoot" "shoot tip"
[39] "sori" "stem"
[41] "sterile leaf" "whole sporophyte DMSO"
[43] "whole sporophyte IAA" "WT-S5"
[45] "WT-WYS" "young sporophyte"
MEs.m <- sample.eigen.mean %>% mutate(label=make.names(label, unique = TRUE)) %>% column_to_rownames("label") %>% select(starts_with("ME")) %>% as.matrix()
heatmap.2(MEs.m, trace="none", cexRow= 0.6, col="bluered")
save(module.assignment, MEs, lcpm.filter, CrLFY1, CrLFY2, file="../output/WGCNA_sporophyte_all.Rdata")